"""
Phase 5: Referee Fixes for Innovation Paper
============================================
Addresses referee comments on:
A) Log patents definition audit (9.4 log-point claim)
B) Bootstrap check on patents vs high-tech
C) Balanced panel pre/post-2000
D) Efficiency regression audit
E) Health mediation path check
F) FDI winsorization
G) Country FE + Year FE robustness for high-tech exports
"""

import sys
sys.path.insert(0, '/mnt/c/demographics_capital_flows/multilateral/src')
from model import PanelGLS
import pandas as pd
import numpy as np
from pathlib import Path

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------
DATA_DIR = Path('/mnt/c/demographics_capital_flows/innovation/data/processed')
OUT_DIR = Path('/mnt/c/demographics_capital_flows/innovation/output/tables')
OUT_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(DATA_DIR / 'innovation_panel.csv')
df = df[df['year'] <= 2024].copy()
print(f"Panel loaded: {df.shape[0]} obs, {df['iso3'].nunique()} countries, years {df['year'].min()}-{df['year'].max()}")
print(f"Columns ({len(df.columns)}): {list(df.columns)[:20]} ...")

controls = ['rgdp_growth', 'kaopen']
z_vars = ['Z_1', 'Z_2', 'Z_3']


def run_gls(dep, indep_vars, data, label=""):
    """Run PanelGLS, return results dict."""
    sub = data.dropna(subset=[dep] + indep_vars + ['iso3', 'year'])
    if len(sub) < 20:
        print(f"  [{label}] Too few obs: {len(sub)}")
        return None
    y = sub[dep].values
    X = sub[indep_vars].values
    gls = PanelGLS()
    gls.fit(y, X, sub['iso3'].values, sub['year'].values)
    gls.feature_names = indep_vars
    print(f"  [{label}] N={gls.n_obs}, countries={gls.n_countries}, R²={gls.r_squared:.4f}, ρ={gls.rho:.3f}")
    return gls


def fmt_sig(p):
    if p < 0.01: return '***'
    if p < 0.05: return '**'
    if p < 0.10: return '*'
    return ''


# =========================================================================
# PART A: Log Patents Definition Audit
# =========================================================================
print("\n" + "="*70)
print("PART A: Log Patents Definition Audit")
print("="*70)

print("\npatent_total stats:")
print(df['patent_total'].describe())
print(f"\nlog_patents stats:")
print(df['log_patents'].describe())

# Check if log_patents = ln(patent_total), ln(1+patent_total), or log10
sub_a = df.dropna(subset=['log_patents', 'patent_total'])
sub_a = sub_a[sub_a['patent_total'] > 0]

check_ln = np.log(sub_a['patent_total'].values[:10])
check_ln1p = np.log(1 + sub_a['patent_total'].values[:10])
check_log10 = np.log10(sub_a['patent_total'].values[:10])
actual = sub_a['log_patents'].values[:10]

print("\nDefinition check (first 10 non-zero obs):")
print(f"  actual log_patents: {actual[:5]}")
print(f"  ln(patents):        {check_ln[:5]}")
print(f"  ln(1+patents):      {check_ln1p[:5]}")
print(f"  log10(patents):     {check_log10[:5]}")

ln_match = np.allclose(actual, check_ln, atol=0.01)
ln1p_match = np.allclose(actual, check_ln1p, atol=0.01)
log10_match = np.allclose(actual, check_log10, atol=0.01)
print(f"\n  ln match: {ln_match}, ln(1+x) match: {ln1p_match}, log10 match: {log10_match}")

if ln_match:
    definition = "ln(patents)"
elif ln1p_match:
    definition = "ln(1 + patents)"
elif log10_match:
    definition = "log10(patents)"
else:
    definition = "UNKNOWN - check manually"

print(f"  → log_patents is: {definition}")

# Within-country SD of Z_1
z1_within = df.groupby('iso3')['Z_1'].transform(lambda x: x - x.mean())
within_sd_z1 = z1_within.std()
print(f"\nWithin-country SD of Z_1: {within_sd_z1:.4f}")

# Coefficient interpretation
coef_z1 = -6.9098  # from baseline
delta_log = coef_z1 * within_sd_z1
pct_change = (np.exp(delta_log) - 1) * 100
print(f"\n1 within-SD of Z_1 ({within_sd_z1:.4f}):")
print(f"  → Δlog_patents = {coef_z1:.4f} × {within_sd_z1:.4f} = {delta_log:.4f}")
print(f"  → % change in patents = (exp({delta_log:.4f}) - 1) × 100 = {pct_change:.1f}%")

# For a typical 20-year aging window
delta_z1_20yr = 0.27
delta_log_20yr = coef_z1 * delta_z1_20yr
pct_change_20yr = (np.exp(delta_log_20yr) - 1) * 100
print(f"\n20-year aging (ΔZ_1 ≈ {delta_z1_20yr}):")
print(f"  → Δlog_patents = {coef_z1:.4f} × {delta_z1_20yr} = {delta_log_20yr:.4f}")
print(f"  → % change in patents = {pct_change_20yr:.1f}%")

# Save Part A
with open(OUT_DIR / 'referee_log_audit.md', 'w') as f:
    f.write("# Referee Response: Log Patents Definition Audit\n\n")
    f.write(f"## Definition\n")
    f.write(f"- `log_patents` = **{definition}**\n")
    f.write(f"- patent_total: mean={df['patent_total'].mean():.0f}, median={df['patent_total'].median():.0f}, "
            f"min={df['patent_total'].min():.0f}, max={df['patent_total'].max():.0f}\n")
    f.write(f"- log_patents: mean={df['log_patents'].mean():.3f}, SD={df['log_patents'].std():.3f}\n\n")
    f.write(f"## Within-Country Variation\n")
    f.write(f"- Within-country SD of Z_1: **{within_sd_z1:.4f}**\n\n")
    f.write(f"## Economic Interpretation of Z_1 Coefficient (-6.91)\n\n")
    f.write(f"| Scenario | ΔZ_1 | Δlog(patents) | % Δ patents |\n")
    f.write(f"|---|---|---|---|\n")
    f.write(f"| 1 within-SD | {within_sd_z1:.4f} | {delta_log:.3f} | {pct_change:.1f}% |\n")
    f.write(f"| 20-year aging | {delta_z1_20yr} | {delta_log_20yr:.3f} | {pct_change_20yr:.1f}% |\n\n")
    f.write(f"The coefficient of -6.91 is NOT \"9.4 log-points\" for any plausible demographic shift. "
            f"A 1 within-SD increase in Z_1 ({within_sd_z1:.3f}) implies a {abs(pct_change):.0f}% change "
            f"in patent counts. Over a 20-year aging window (ΔZ_1 ≈ 0.27), the implied change is {pct_change_20yr:.1f}%.\n")
    f.write(f"\n*Note: The referee may have multiplied the coefficient by a 1-unit change in Z_1, "
            f"which is ~{1/within_sd_z1:.0f}× the within-country SD and outside observed variation.*\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_log_audit.md'}")

# =========================================================================
# PART B: Bootstrap Check on Patents vs High-Tech
# =========================================================================
print("\n" + "="*70)
print("PART B: Bootstrap Check (200 iterations, cluster by country)")
print("="*70)

# Existing bootstrap results are available - re-run with 200 iterations for referee
np.random.seed(42)
n_boot = 200

dep_vars_boot = {
    'log_patents': 'log_patents',
    'hightech_exports_share': 'hightech_exports_share',
    'patents_per_million': 'patents_per_million',
}

boot_results = {}
for dep_label, dep_var in dep_vars_boot.items():
    print(f"\n  Bootstrapping: Z → {dep_label}")
    sub = df.dropna(subset=[dep_var] + z_vars + controls + ['iso3', 'year']).copy()
    countries = sub['iso3'].unique()
    n_c = len(countries)

    # Baseline
    gls_base = PanelGLS()
    gls_base.fit(sub[dep_var].values, sub[z_vars + controls].values,
                 sub['iso3'].values, sub['year'].values)
    base_coefs = gls_base.beta[:3]  # Z_1, Z_2, Z_3
    base_se = gls_base.se[:3]

    boot_coefs = np.zeros((n_boot, 3))
    n_fail = 0
    for b in range(n_boot):
        # Resample countries with replacement
        boot_countries = np.random.choice(countries, size=n_c, replace=True)
        boot_frames = []
        for i, c in enumerate(boot_countries):
            chunk = sub[sub['iso3'] == c].copy()
            chunk['iso3'] = f"{c}_{i}"  # make unique
            boot_frames.append(chunk)
        boot_df = pd.concat(boot_frames, ignore_index=True)

        try:
            gls_b = PanelGLS()
            gls_b.fit(boot_df[dep_var].values, boot_df[z_vars + controls].values,
                      boot_df['iso3'].values, boot_df['year'].values)
            boot_coefs[b] = gls_b.beta[:3]
        except:
            boot_coefs[b] = np.nan
            n_fail += 1

    boot_coefs_clean = boot_coefs[~np.isnan(boot_coefs[:, 0])]
    boot_se_vals = np.std(boot_coefs_clean, axis=0)
    ci_lo = np.percentile(boot_coefs_clean, 2.5, axis=0)
    ci_hi = np.percentile(boot_coefs_clean, 97.5, axis=0)
    # Bootstrap p-value: fraction of bootstrap samples with opposite sign
    boot_p = np.array([np.mean(boot_coefs_clean[:, j] > 0) if base_coefs[j] < 0
                       else np.mean(boot_coefs_clean[:, j] < 0)
                       for j in range(3)]) * 2  # two-sided
    boot_p = np.clip(boot_p, 0, 1)

    boot_results[dep_label] = {
        'base_coefs': base_coefs, 'base_se': base_se,
        'boot_se': boot_se_vals, 'ci_lo': ci_lo, 'ci_hi': ci_hi,
        'boot_p': boot_p, 'n_obs': gls_base.n_obs, 'n_c': gls_base.n_countries,
        'n_fail': n_fail
    }

    for j, v in enumerate(['Z_1', 'Z_2', 'Z_3']):
        print(f"    {v}: coef={base_coefs[j]:.4f}, analytic_SE={base_se[j]:.4f}, "
              f"boot_SE={boot_se_vals[j]:.4f}, boot_p={boot_p[j]:.4f}, "
              f"95%CI=[{ci_lo[j]:.4f}, {ci_hi[j]:.4f}]")

# Save Part B
with open(OUT_DIR / 'referee_bootstrap_check.md', 'w') as f:
    f.write("# Referee Response: Bootstrap Standard Errors (200 iterations, cluster by country)\n\n")
    f.write("| Dep Var | Variable | Coef | Analytic SE | Bootstrap SE | Boot 2.5% | Boot 97.5% | Boot p | Sig |\n")
    f.write("|---|---|---|---|---|---|---|---|---|\n")
    for dep_label, res in boot_results.items():
        for j, v in enumerate(['Z_1', 'Z_2', 'Z_3']):
            sig = fmt_sig(res['boot_p'][j])
            f.write(f"| {dep_label} | {v} | {res['base_coefs'][j]:.4f} | "
                    f"{res['base_se'][j]:.4f} | {res['boot_se'][j]:.4f} | "
                    f"{res['ci_lo'][j]:.4f} | {res['ci_hi'][j]:.4f} | "
                    f"{res['boot_p'][j]:.4f} | {sig} |\n")
    f.write(f"\n*200 cluster-bootstrap iterations, resampling countries with replacement.*\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_bootstrap_check.md'}")

# =========================================================================
# PART C: Balanced Panel Pre/Post-2000
# =========================================================================
print("\n" + "="*70)
print("PART C: Balanced Panel Pre/Post-2000")
print("="*70)

sub_c = df.dropna(subset=['log_patents'] + z_vars + controls + ['iso3', 'year']).copy()
pre = sub_c[sub_c['year'] < 2000]
post = sub_c[sub_c['year'] >= 2000]

countries_pre = set(pre['iso3'].unique())
countries_post = set(post['iso3'].unique())
balanced_countries = countries_pre & countries_post
print(f"Countries with data pre-2000: {len(countries_pre)}")
print(f"Countries with data post-2000: {len(countries_post)}")
print(f"Countries in BOTH periods: {len(balanced_countries)}")

bal = sub_c[sub_c['iso3'].isin(balanced_countries)]
bal_pre = bal[bal['year'] < 2000]
bal_post = bal[bal['year'] >= 2000]

print(f"\nBalanced panel: {len(balanced_countries)} countries")
print(f"  Pre-2000: {len(bal_pre)} obs")
print(f"  Post-2000: {len(bal_post)} obs")

results_c = {}
for label, data in [('Pre-2000 (balanced)', bal_pre), ('Post-2000 (balanced)', bal_post),
                     ('Full pre-2000', pre), ('Full post-2000', post)]:
    gls = run_gls('log_patents', z_vars + controls, data, label)
    if gls is not None:
        results_c[label] = gls

# Save Part C
with open(OUT_DIR / 'referee_balanced_patents.md', 'w') as f:
    f.write("# Referee Response: Balanced Panel Pre/Post-2000\n\n")
    f.write(f"Countries with patent data in BOTH pre-2000 AND post-2000: **{len(balanced_countries)}**\n\n")
    f.write("| Sample | N | Countries | R² | Z_1 coef | Z_1 SE | Z_1 p | Sig |\n")
    f.write("|---|---|---|---|---|---|---|---|\n")
    for label, gls in results_c.items():
        sig = fmt_sig(gls.pvalues[0])
        f.write(f"| {label} | {gls.n_obs} | {gls.n_countries} | {gls.r_squared:.4f} | "
                f"{gls.beta[0]:.4f} | {gls.se[0]:.4f} | {gls.pvalues[0]:.4f} | {sig} |\n")
    f.write(f"\n## Full Coefficients (Balanced Panel)\n\n")
    f.write("| Period | Variable | Coef | SE | p-value | Sig |\n")
    f.write("|---|---|---|---|---|---|\n")
    for label in ['Pre-2000 (balanced)', 'Post-2000 (balanced)']:
        if label in results_c:
            gls = results_c[label]
            names = z_vars + controls
            for j, v in enumerate(names):
                if j < len(gls.beta):
                    sig = fmt_sig(gls.pvalues[j])
                    f.write(f"| {label} | {v} | {gls.beta[j]:.4f} | {gls.se[j]:.4f} | {gls.pvalues[j]:.4f} | {sig} |\n")

    # Interpretation
    f.write(f"\n## Interpretation\n")
    if 'Pre-2000 (balanced)' in results_c and 'Post-2000 (balanced)' in results_c:
        pre_sign = np.sign(results_c['Pre-2000 (balanced)'].beta[0])
        post_sign = np.sign(results_c['Post-2000 (balanced)'].beta[0])
        if pre_sign != post_sign:
            f.write("Sign of Z_1 **flips** between pre- and post-2000 on the balanced panel, "
                    "suggesting a genuine structural break rather than compositional artifact.\n")
        else:
            f.write("Sign of Z_1 is **consistent** across periods on the balanced panel, "
                    "suggesting the time-break finding may reflect compositional changes in the sample.\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_balanced_patents.md'}")

# =========================================================================
# PART D: Efficiency Regression Audit
# =========================================================================
print("\n" + "="*70)
print("PART D: Efficiency Regression Audit")
print("="*70)

# Define efficiency = log(patents_per_million / rd_expenditure_gdp)
sub_d = df.dropna(subset=['patents_per_million', 'rd_expenditure_gdp']).copy()
sub_d = sub_d[(sub_d['patents_per_million'] > 0) & (sub_d['rd_expenditure_gdp'] > 0)].copy()
sub_d['efficiency'] = np.log(sub_d['patents_per_million'] / sub_d['rd_expenditure_gdp'])
print(f"Efficiency variable: N={len(sub_d)}, countries={sub_d['iso3'].nunique()}")
print(f"  efficiency stats: mean={sub_d['efficiency'].mean():.3f}, SD={sub_d['efficiency'].std():.3f}")

# D1: Z → efficiency
print("\nD1: Z_1, Z_2, Z_3 → efficiency")
gls_d1 = run_gls('efficiency', z_vars + controls, sub_d, "Z → efficiency")

# D2: old_dep, youth_dep → efficiency
print("\nD2: old_dep, youth_dep → efficiency")
age_vars = ['old_dep', 'youth_dep']
gls_d2 = run_gls('efficiency', age_vars + controls, sub_d, "age_ratios → efficiency")

# Save Part D
with open(OUT_DIR / 'referee_efficiency_audit.md', 'w') as f:
    f.write("# Referee Response: Efficiency Regression Audit\n\n")
    f.write(f"Efficiency = log(patents_per_million / rd_expenditure_gdp)\n")
    f.write(f"- N with valid efficiency: {len(sub_d)}, countries: {sub_d['iso3'].nunique()}\n\n")

    f.write("## D1: Z_1, Z_2, Z_3 → Efficiency\n\n")
    if gls_d1:
        f.write(f"N = {gls_d1.n_obs}, Countries = {gls_d1.n_countries}, R² = {gls_d1.r_squared:.4f}, ρ = {gls_d1.rho:.3f}\n\n")
        f.write("| Variable | Coef | SE | p-value | Sig |\n")
        f.write("|---|---|---|---|---|\n")
        names = z_vars + controls
        for j, v in enumerate(names):
            if j < len(gls_d1.beta):
                sig = fmt_sig(gls_d1.pvalues[j])
                f.write(f"| {v} | {gls_d1.beta[j]:.4f} | {gls_d1.se[j]:.4f} | {gls_d1.pvalues[j]:.4f} | {sig} |\n")

    f.write("\n## D2: old_dep, youth_dep → Efficiency\n\n")
    if gls_d2:
        f.write(f"N = {gls_d2.n_obs}, Countries = {gls_d2.n_countries}, R² = {gls_d2.r_squared:.4f}, ρ = {gls_d2.rho:.3f}\n\n")
        f.write("| Variable | Coef | SE | p-value | Sig |\n")
        f.write("|---|---|---|---|---|\n")
        names = age_vars + controls
        for j, v in enumerate(names):
            if j < len(gls_d2.beta):
                sig = fmt_sig(gls_d2.pvalues[j])
                f.write(f"| {v} | {gls_d2.beta[j]:.4f} | {gls_d2.se[j]:.4f} | {gls_d2.pvalues[j]:.4f} | {sig} |\n")

    f.write("\n## Interpretation\n")
    if gls_d1 and gls_d2:
        z1_sig = gls_d1.pvalues[0] < 0.10
        old_sig = gls_d2.pvalues[0] < 0.10
        f.write(f"- Z_1 on efficiency: p = {gls_d1.pvalues[0]:.4f} → {'significant' if z1_sig else '**not significant**'}\n")
        f.write(f"- old_dep on efficiency: p = {gls_d2.pvalues[0]:.4f} → {'**significant**' if old_sig else 'not significant'}\n")
        if not z1_sig and old_sig:
            f.write("\nConfirmed: Z_1 is null on efficiency while old_dep is significant. "
                    "The PCA components capture different variation than the raw dependency ratio.\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_efficiency_audit.md'}")

# =========================================================================
# PART E: Health Mediation Path Check
# =========================================================================
print("\n" + "="*70)
print("PART E: Health Mediation Path Check")
print("="*70)

# Path a: Z_1 → health_exp_gdp
print("\nPath a: Z_1 → health_exp_gdp")
gls_e1 = run_gls('health_exp_gdp', z_vars + controls, df, "Z → health_exp")

# Path b: health_exp_gdp → patents_per_million
print("\nPath b: health_exp_gdp → patents_per_million")
sub_e = df.dropna(subset=['patents_per_million', 'health_exp_gdp'] + controls + ['iso3', 'year']).copy()
gls_e2 = run_gls('patents_per_million', ['health_exp_gdp'] + controls, sub_e, "health → patents_pm")

# Also: health + Z → patents_per_million (to see if health mediates)
print("\nPath c (combined): Z + health → patents_per_million")
gls_e3 = run_gls('patents_per_million', z_vars + ['health_exp_gdp'] + controls, sub_e, "Z + health → patents_pm")

# Save Part E
with open(OUT_DIR / 'referee_health_paths.md', 'w') as f:
    f.write("# Referee Response: Health Mediation Path Check\n\n")

    f.write("## Path a: Z → health_exp_gdp\n\n")
    if gls_e1:
        f.write(f"N = {gls_e1.n_obs}, Countries = {gls_e1.n_countries}, R² = {gls_e1.r_squared:.4f}\n\n")
        f.write("| Variable | Coef | SE | p-value | Sig |\n")
        f.write("|---|---|---|---|---|\n")
        names = z_vars + controls
        for j, v in enumerate(names):
            if j < len(gls_e1.beta):
                sig = fmt_sig(gls_e1.pvalues[j])
                f.write(f"| {v} | {gls_e1.beta[j]:.4f} | {gls_e1.se[j]:.4f} | {gls_e1.pvalues[j]:.4f} | {sig} |\n")
        z1_path_a_sig = gls_e1.pvalues[0] < 0.10

    f.write("\n## Path b: health_exp_gdp → patents_per_million\n\n")
    if gls_e2:
        f.write(f"N = {gls_e2.n_obs}, Countries = {gls_e2.n_countries}, R² = {gls_e2.r_squared:.4f}\n\n")
        f.write("| Variable | Coef | SE | p-value | Sig |\n")
        f.write("|---|---|---|---|---|\n")
        names = ['health_exp_gdp'] + controls
        for j, v in enumerate(names):
            if j < len(gls_e2.beta):
                sig = fmt_sig(gls_e2.pvalues[j])
                f.write(f"| {v} | {gls_e2.beta[j]:.4f} | {gls_e2.se[j]:.4f} | {gls_e2.pvalues[j]:.4f} | {sig} |\n")
        health_path_b_sig = gls_e2.pvalues[0] < 0.10

    f.write("\n## Path c: Z + health → patents_per_million\n\n")
    if gls_e3:
        f.write(f"N = {gls_e3.n_obs}, Countries = {gls_e3.n_countries}, R² = {gls_e3.r_squared:.4f}\n\n")
        f.write("| Variable | Coef | SE | p-value | Sig |\n")
        f.write("|---|---|---|---|---|\n")
        names = z_vars + ['health_exp_gdp'] + controls
        for j, v in enumerate(names):
            if j < len(gls_e3.beta):
                sig = fmt_sig(gls_e3.pvalues[j])
                f.write(f"| {v} | {gls_e3.beta[j]:.4f} | {gls_e3.se[j]:.4f} | {gls_e3.pvalues[j]:.4f} | {sig} |\n")

    f.write("\n## Verdict\n")
    if gls_e1 and gls_e2:
        f.write(f"- Path a (Z_1 → health_exp): p = {gls_e1.pvalues[0]:.4f} → {'significant' if z1_path_a_sig else '**not significant**'}\n")
        f.write(f"- Path b (health → patents_pm): p = {gls_e2.pvalues[0]:.4f} → {'significant' if health_path_b_sig else '**not significant**'}\n")
        if not z1_path_a_sig and not health_path_b_sig:
            f.write("\n**Both paths are insignificant.** The mediation claim (53.6%) should be dropped or heavily qualified.\n")
        elif not z1_path_a_sig or not health_path_b_sig:
            f.write(f"\nOne path is insignificant. The mediation claim should be qualified.\n")
        else:
            f.write("\nBoth paths are significant. Mediation claim is supported.\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_health_paths.md'}")

# =========================================================================
# PART F: FDI Winsorization
# =========================================================================
print("\n" + "="*70)
print("PART F: FDI Winsorization")
print("="*70)

for fdi_var in ['fdi_inflows_gdp', 'fdi_outflows_gdp']:
    sub_f = df[fdi_var].dropna()
    print(f"\n{fdi_var}: min={sub_f.min():.2f}, max={sub_f.max():.2f}, "
          f"p1={sub_f.quantile(0.01):.2f}, p99={sub_f.quantile(0.99):.2f}")

# Winsorize
df_w = df.copy()
for fdi_var in ['fdi_inflows_gdp', 'fdi_outflows_gdp']:
    col = df_w[fdi_var]
    p1 = col.quantile(0.01)
    p99 = col.quantile(0.99)
    df_w[f'{fdi_var}_w'] = col.clip(lower=p1, upper=p99)

# Baseline vs winsorized
fdi_results = {}
for fdi_var in ['fdi_inflows_gdp', 'fdi_outflows_gdp']:
    print(f"\n--- {fdi_var} ---")
    # Baseline
    gls_base = run_gls(fdi_var, z_vars + controls, df, f"Baseline {fdi_var}")
    fdi_results[f'{fdi_var}_base'] = gls_base

    # Winsorized
    gls_win = run_gls(f'{fdi_var}_w', z_vars + controls, df_w, f"Winsorized {fdi_var}")
    fdi_results[f'{fdi_var}_win'] = gls_win

# Save Part F
with open(OUT_DIR / 'referee_fdi_winsorized.md', 'w') as f:
    f.write("# Referee Response: FDI Winsorization (p1/p99)\n\n")

    for fdi_var in ['fdi_inflows_gdp', 'fdi_outflows_gdp']:
        sub_f = df[fdi_var].dropna()
        f.write(f"## {fdi_var}\n")
        f.write(f"- Raw: min={sub_f.min():.2f}, max={sub_f.max():.2f}\n")
        f.write(f"- p1={sub_f.quantile(0.01):.2f}, p99={sub_f.quantile(0.99):.2f}\n\n")

    f.write("## Results: Baseline vs Winsorized\n\n")
    f.write("| Dep Var | Version | N | R² | Z_1 coef | Z_1 SE | Z_1 p | Sig |\n")
    f.write("|---|---|---|---|---|---|---|---|\n")
    for fdi_var in ['fdi_inflows_gdp', 'fdi_outflows_gdp']:
        for suffix, label in [('_base', 'Raw'), ('_win', 'Winsorized')]:
            key = f'{fdi_var}{suffix}'
            gls = fdi_results[key]
            if gls:
                sig = fmt_sig(gls.pvalues[0])
                f.write(f"| {fdi_var} | {label} | {gls.n_obs} | {gls.r_squared:.4f} | "
                        f"{gls.beta[0]:.4f} | {gls.se[0]:.4f} | {gls.pvalues[0]:.4f} | {sig} |\n")

    f.write("\n## Full Coefficients (Winsorized)\n\n")
    f.write("| Dep Var | Variable | Coef | SE | p-value | Sig |\n")
    f.write("|---|---|---|---|---|---|\n")
    for fdi_var in ['fdi_inflows_gdp', 'fdi_outflows_gdp']:
        key = f'{fdi_var}_win'
        gls = fdi_results[key]
        if gls:
            names = z_vars + controls
            for j, v in enumerate(names):
                if j < len(gls.beta):
                    sig = fmt_sig(gls.pvalues[j])
                    f.write(f"| {fdi_var} | {v} | {gls.beta[j]:.4f} | {gls.se[j]:.4f} | {gls.pvalues[j]:.4f} | {sig} |\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_fdi_winsorized.md'}")

# =========================================================================
# PART G: Country FE + Year FE Robustness (High-Tech Exports)
# =========================================================================
print("\n" + "="*70)
print("PART G: FE Robustness for High-Tech Exports")
print("="*70)

dep_g = 'hightech_exports_share'
sub_g = df.dropna(subset=[dep_g] + z_vars + controls + ['iso3', 'year']).copy()
print(f"Sample: {len(sub_g)} obs, {sub_g['iso3'].nunique()} countries")

# G1: Baseline PanelGLS
print("\nG1: Baseline PanelGLS")
gls_g1 = run_gls(dep_g, z_vars + controls, sub_g, "Baseline")

# G2: + year dummies
print("\nG2: + year dummies")
year_dummies = pd.get_dummies(sub_g['year'], prefix='yr', drop_first=True, dtype=float)
sub_g_yd = pd.concat([sub_g.reset_index(drop=True), year_dummies.reset_index(drop=True)], axis=1)
year_cols = list(year_dummies.columns)
gls_g2 = run_gls(dep_g, z_vars + controls + year_cols, sub_g_yd, "+ year dummies")

# G3: Country-demeaned + year dummies
print("\nG3: Country-demeaned + year dummies")
demean_vars = [dep_g] + z_vars + controls
for v in demean_vars:
    sub_g_yd[f'{v}_dm'] = sub_g_yd.groupby('iso3')[v].transform(lambda x: x - x.mean())

dm_z = [f'{v}_dm' for v in z_vars]
dm_controls = [f'{v}_dm' for v in controls]
gls_g3 = run_gls(f'{dep_g}_dm', dm_z + dm_controls + year_cols, sub_g_yd, "Demeaned + yr dummies")

# Save Part G
with open(OUT_DIR / 'referee_fe_robustness.md', 'w') as f:
    f.write("# Referee Response: FE Robustness for High-Tech Exports\n\n")
    f.write("| Model | N | Countries | R² | Z_1 coef | Z_1 SE | Z_1 p | Sig |\n")
    f.write("|---|---|---|---|---|---|---|---|\n")

    for label, gls in [("G1: Baseline", gls_g1), ("G2: + Year FE", gls_g2), ("G3: Country FE + Year FE", gls_g3)]:
        if gls:
            sig = fmt_sig(gls.pvalues[0])
            f.write(f"| {label} | {gls.n_obs} | {gls.n_countries} | {gls.r_squared:.4f} | "
                    f"{gls.beta[0]:.4f} | {gls.se[0]:.4f} | {gls.pvalues[0]:.4f} | {sig} |\n")

    f.write("\n## Full Demographic Coefficients\n\n")
    f.write("| Model | Variable | Coef | SE | p-value | Sig |\n")
    f.write("|---|---|---|---|---|---|\n")
    for label, gls, vnames in [
        ("G1: Baseline", gls_g1, z_vars),
        ("G2: + Year FE", gls_g2, z_vars),
        ("G3: Country FE + Year FE", gls_g3, dm_z)
    ]:
        if gls:
            for j, v in enumerate(vnames):
                display_v = v.replace('_dm', '') if '_dm' in v else v
                sig = fmt_sig(gls.pvalues[j])
                f.write(f"| {label} | {display_v} | {gls.beta[j]:.4f} | {gls.se[j]:.4f} | {gls.pvalues[j]:.4f} | {sig} |\n")

print(f"\n→ Saved: {OUT_DIR / 'referee_fe_robustness.md'}")

print("\n" + "="*70)
print("ALL PARTS COMPLETE")
print("="*70)
print(f"\nOutput files:")
for fname in ['referee_log_audit.md', 'referee_bootstrap_check.md', 'referee_balanced_patents.md',
              'referee_efficiency_audit.md', 'referee_health_paths.md',
              'referee_fdi_winsorized.md', 'referee_fe_robustness.md']:
    print(f"  {OUT_DIR / fname}")
